from algo.compute_stats import get_norm_bins, normal_test
import numpy as np
import math
from utils.format_util import cast_float
from utils.format_util import dup_name_handler


def zscore_outliers(data, thres):
    """Return the positions of z-score outliers in ``data``.

    A point counts as an outlier when ``abs((x - mean) / std) <= thres`` does
    not hold.  The test is deliberately written as a negated ``<=`` so that
    entries whose z-score is NaN are still reported.

    Args:
        data: 1-D numeric array-like (the caller passes a NaN-free ndarray).
        thres: Maximum absolute z-score that is still considered normal.

    Returns:
        1-D integer ndarray with the positions (into ``data``) of outliers.
        Empty when the data has zero spread.
    """
    mean = np.mean(data)
    # np.std defaults to the population standard deviation (ddof=0).
    std = np.std(data)
    if std == 0:
        # Constant data: dividing by zero would turn every z-score into NaN
        # and flag *all* points as outliers.  No spread means no outliers.
        return np.array([], dtype=np.intp)
    index = np.where(~(abs((data - mean) / std) <= thres))[0]
    return index


def _outlier_bin_starts(edges, outliers):
    """Return the left edge of the bin for every (bin, outlier) match.

    ``edges`` is a list of histogram bin edges; a final open-ended bin up to
    +inf is added after the last edge.  Matches are emitted bin by bin, and
    within a bin in the order the outliers are given.  Values below the first
    edge match no bin.
    """
    bounds = list(edges) + [math.inf]
    return [
        lower
        for lower, upper in zip(bounds, bounds[1:])
        for value in outliers
        if lower <= value < upper
    ]


def run(df, params):
    """Detect z-score outliers in the selected columns of ``df``.

    For every selected column the histogram / fitted-normal data is collected
    for visualisation.  Outlier detection itself is only performed when the
    column passes the normality test (p > 0.05) and has at least 10 rows.

    Args:
        df: pandas DataFrame holding the data.
        params: Mapping with the keys
            ``"cols"``   - string that evaluates to a list of column names,
            ``"append"`` - int-like flag; truthy writes the processed values
                           into a new column placed right after the source,
            ``"action"`` - ``"replace_null"`` (outliers become NaN),
                           ``"delete"`` (rows holding outliers are dropped)
                           or ``"no"`` (detect only),
            ``"thres"``  - z-score threshold, convertible to float.

    Returns:
        Tuple ``(df, results)``.  ``results["data_vis"]`` maps each column to
        its chart data (plus outlier statistics when detection ran),
        ``results["highlight"]`` maps each column to its distinct outlier
        values, and ``results["message"]`` (only present when there is
        something to report) joins all user-facing notices.
    """
    results = {"data_vis": {}, "highlight": {}}
    messages = []
    check_normal = []
    check_num = []
    check_outlier = []

    # SECURITY(review): eval() executes arbitrary Python.  If ``params`` can
    # originate from an untrusted client this must be replaced with a safe
    # parser such as ast.literal_eval - confirm with the callers first.
    cols = eval(params.get("cols"))
    is_append = int(params.get("append"))
    action = params.get("action")
    thres = float(params.get("thres"))
    all_cols = df.columns.tolist()

    alpha = 0.05
    num_limit = 10

    if len(cols) == 0:
        messages.append("请选择特征列")
        # Publish the notice; previously it was collected but never returned.
        results["message"] = "; ".join(messages)
        return df, results

    for col in cols:
        data = df[col].values
        valid_index = np.where(~np.isnan(data))[0]
        valid_data = data[valid_index]

        # Row count deliberately includes missing values.
        num = len(data)
        is_normal = normal_test(valid_data)["p"] > alpha

        hist, bin_edges, fitted = get_norm_bins(valid_data)
        result = cast_float(
            {
                "hist": hist.tolist(),
                "bin": bin_edges.tolist(),
                "normal": fitted.tolist(),
            }
        )

        if not is_normal:
            check_normal.append(col)
        if num < num_limit:
            check_num.append(col)

        if is_normal and num >= num_limit:
            # Map positions within the NaN-free data back to positions in
            # the full column.
            index = valid_index[zscore_outliers(valid_data, thres)]
            outliers = data[index].tolist()
            result["outlier_bins"] = _outlier_bin_starts(
                bin_edges.tolist(), outliers
            )

            result = cast_float(result)
            result["outlier_num"] = len(outliers)
            result["outlier_percent"] = round(len(outliers) / num * 100, 5)
            if outliers:
                results["highlight"][col] = list(set(outliers))
            else:
                check_outlier.append(col)

            if action == "replace_null" and len(index) > 0:
                # Work on a float copy so NaN can be stored.
                data = data.astype(float)
                data[index] = np.nan
            if action != "no" and is_append:
                new_col = "_".join([col, "统计异常检测"])
                new_col = dup_name_handler(new_col, all_cols)

                # Insert the new column directly after its source column.
                col_index = all_cols.index(col) + 1
                all_cols.insert(col_index, new_col)
                df = df.reindex(columns=all_cols)
                df[new_col] = data
            else:
                df[col] = data
            if action == "delete":
                # ``index`` holds row *positions*; DataFrame.drop expects
                # *labels*.  Translate first so the right rows are removed
                # even when the index is not 0..n-1 (e.g. after rows were
                # dropped for a previous column).
                df.drop(df.index[index], inplace=True)
        results["data_vis"][col] = result

    if check_normal:
        messages.append("以下特征列不符合正态分布： " + ",\t".join(check_normal))
    if check_num:
        messages.append("以下特征列样本量小于10： " + ",\t".join(check_num))
    if check_outlier:
        messages.append("以下特征列无异常值： " + ",\t".join(check_outlier))
    if messages:
        results["message"] = "; ".join(messages)
    return df, results
